In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
In [3]:
# Load the Amazon sales dataset and preview the first rows
dataset=pd.read_csv("Main_proj_sale_dataset.csv")
dataset.head()
Out[3]:
index Order ID Date Status Fulfilment Sales Channel ship-service-level Style SKU Category ... currency Amount ship-city ship-state ship-postal-code ship-country promotion-ids B2B fulfilled-by Unnamed: 22
0 0 405-8078784-5731545 04-30-22 Cancelled Merchant Amazon.in Standard SET389 SET389-KR-NP-S Set ... INR 647.62 MUMBAI MAHARASHTRA 400081.0 IN NaN False Easy Ship NaN
1 1 171-9198151-1101146 04-30-22 Shipped - Delivered to Buyer Merchant Amazon.in Standard JNE3781 JNE3781-KR-XXXL kurta ... INR 406.00 BENGALURU KARNATAKA 560085.0 IN Amazon PLCC Free-Financing Universal Merchant ... False Easy Ship NaN
2 2 404-0687676-7273146 04-30-22 Shipped Amazon Amazon.in Expedited JNE3371 JNE3371-KR-XL kurta ... INR 329.00 NAVI MUMBAI MAHARASHTRA 410210.0 IN IN Core Free Shipping 2015/04/08 23-48-5-108 True NaN NaN
3 3 403-9615377-8133951 04-30-22 Cancelled Merchant Amazon.in Standard J0341 J0341-DR-L Western Dress ... INR 753.33 PUDUCHERRY PUDUCHERRY 605008.0 IN NaN False Easy Ship NaN
4 4 407-1069790-7240320 04-30-22 Shipped Amazon Amazon.in Expedited JNE3671 JNE3671-TU-XXXL Top ... INR 574.00 CHENNAI TAMIL NADU 600073.0 IN NaN False NaN NaN

5 rows × 24 columns

In [4]:
# List all column names to understand the schema
dataset.columns.tolist()
Out[4]:
['index',
 'Order ID',
 'Date',
 'Status',
 'Fulfilment',
 'Sales Channel ',
 'ship-service-level',
 'Style',
 'SKU',
 'Category',
 'Size',
 'ASIN',
 'Courier Status',
 'Qty',
 'currency',
 'Amount',
 'ship-city',
 'ship-state',
 'ship-postal-code',
 'ship-country',
 'promotion-ids',
 'B2B',
 'fulfilled-by',
 'Unnamed: 22']
In [5]:
# Report the dataset dimensions
n_rows, n_cols = dataset.shape
print("Rows=", n_rows)
print("Columns=", n_cols)
Rows= 128975
Columns= 24
In [6]:
# Count missing values per column
dataset.isnull().sum()
Out[6]:
index                     0
Order ID                  0
Date                      0
Status                    0
Fulfilment                0
Sales Channel             0
ship-service-level        0
Style                     0
SKU                       0
Category                  0
Size                      0
ASIN                      0
Courier Status         6872
Qty                       0
currency               7795
Amount                 7795
ship-city                33
ship-state               33
ship-postal-code         33
ship-country             33
promotion-ids         49153
B2B                       0
fulfilled-by          89698
Unnamed: 22           49050
dtype: int64
In [7]:
# Percentage of missing values per column
# (mean of the boolean null mask equals count/len)
dataset.isnull().mean() * 100
Out[7]:
index                  0.000000
Order ID               0.000000
Date                   0.000000
Status                 0.000000
Fulfilment             0.000000
Sales Channel          0.000000
ship-service-level     0.000000
Style                  0.000000
SKU                    0.000000
Category               0.000000
Size                   0.000000
ASIN                   0.000000
Courier Status         5.328164
Qty                    0.000000
currency               6.043807
Amount                 6.043807
ship-city              0.025586
ship-state             0.025586
ship-postal-code       0.025586
ship-country           0.025586
promotion-ids         38.110487
B2B                    0.000000
fulfilled-by          69.546811
Unnamed: 22           38.030626
dtype: float64
In [8]:
# Low-null columns (<5% missing): ship-city, ship-state, ship-postal-code,
# ship-country — all categorical, so mode imputation is appropriate.

# Work on a copy so the raw `dataset` frame stays untouched
ds_copy=dataset.copy()

ds_copy[['ship-city', 'ship-state', 'ship-postal-code', 'ship-country']].head(5)
Out[8]:
ship-city ship-state ship-postal-code ship-country
0 MUMBAI MAHARASHTRA 400081.0 IN
1 BENGALURU KARNATAKA 560085.0 IN
2 NAVI MUMBAI MAHARASHTRA 410210.0 IN
3 PUDUCHERRY PUDUCHERRY 605008.0 IN
4 CHENNAI TAMIL NADU 600073.0 IN
In [9]:
# ship-city: impute missing values with the most frequent city (mode)
most_common_city = ds_copy['ship-city'].mode()[0]
print("mode value for ship-city column is :", most_common_city)

# Fill the 33 missing cities with the mode
ds_copy['ship-city'] = ds_copy['ship-city'].fillna(most_common_city)

# Confirm no nulls remain
ds_copy['ship-city'].isnull().sum()
mode value for ship-city column is : BENGALURU
Out[9]:
0
In [10]:
# ship-state: impute missing values with the most frequent state (mode)
state_mode = ds_copy['ship-state'].mode()[0]
print("mode value for ship-state column is :", state_mode)

# Fill the missing states with the mode
ds_copy['ship-state'] = ds_copy['ship-state'].fillna(state_mode)

# Confirm no nulls remain
ds_copy['ship-state'].isnull().sum()
mode value for ship-state column is : MAHARASHTRA
Out[10]:
0
In [11]:
# ship-postal-code: impute missing values with the most frequent code (mode).
# Note the mode is a float here because NaNs forced the column to float dtype.
postal_mode = ds_copy['ship-postal-code'].mode()[0]
print("mode value for  ship-postal-code column is :", postal_mode, type(postal_mode))

# Fill the missing postal codes with the mode
ds_copy['ship-postal-code'] = ds_copy['ship-postal-code'].fillna(postal_mode)

# Confirm no nulls remain
ds_copy['ship-postal-code'].isnull().sum()
mode value for  ship-postal-code column is : 201301.0 <class 'numpy.float64'>
Out[11]:
0
In [12]:
# Remove the decimal part in ship-postal-code column
# (the column was float only because it held NaNs; now that they are filled,
# cast back to integer postal codes)
ds_copy['ship-postal-code']=(ds_copy['ship-postal-code']).astype(int)
ds_copy['ship-postal-code'].mode()[0]
Out[12]:
201301
In [13]:
# ship-country: impute missing values with the most frequent country (mode)
country_mode = ds_copy['ship-country'].mode()[0]
print("mode value for ship-country column is :", country_mode)

# Fill the missing countries with the mode
ds_copy['ship-country'] = ds_copy['ship-country'].fillna(country_mode)

# Confirm no nulls remain
ds_copy['ship-country'].isnull().sum()
mode value for ship-country column is : IN
Out[13]:
0
In [14]:
# Moderate-null columns (5%-20% missing): Courier Status, currency, Amount
ds_copy[['Courier Status', 'currency', 'Amount']].head(5)
 
Out[14]:
Courier Status currency Amount
0 NaN INR 647.62
1 Shipped INR 406.00
2 Shipped INR 329.00
3 NaN INR 753.33
4 Shipped INR 574.00
In [15]:
# Inspect the distinct Courier Status values (includes NaN)
ds_copy['Courier Status'].unique().tolist()
Out[15]:
[nan, 'Shipped', 'Cancelled', 'Unshipped']
In [16]:
# Inspect the distinct order Status values
ds_copy['Status'].unique().tolist()
Out[16]:
['Cancelled',
 'Shipped - Delivered to Buyer',
 'Shipped',
 'Shipped - Returned to Seller',
 'Shipped - Rejected by Buyer',
 'Shipped - Lost in Transit',
 'Shipped - Out for Delivery',
 'Shipped - Returning to Seller',
 'Shipped - Picked Up',
 'Pending',
 'Pending - Waiting for Pick Up',
 'Shipped - Damaged',
 'Shipping']
In [17]:
# Filter rows where 'Courier Status' is NaN and select the relevant columns
only_nulls = ds_copy[ds_copy['Courier Status'].isna()][['Status', 'Courier Status']]

# Display the first 5 rows of the filtered data
print("The values of status column and Courier status column ",only_nulls.head(5))

# Count rows with 'Cancelled' in the 'Status' column 
only_nulls_cn= only_nulls[only_nulls['Status'] == 'Cancelled' ][['Courier Status', 'Status']]
print("No of rows has cancelled value in Status column",only_nulls_cn.shape[0])

# Count rows with null values in the 'Courier Status' column
print("No of rows has Null value in Courier status column",ds_copy['Courier Status'].isnull().sum())

# Sanity check before imputing: per the printed counts, 6861 of the 6872
# null Courier Status rows belong to 'Cancelled' orders — nearly all, but
# not every one (the remaining 11 are other statuses). Filling nulls with
# 'Cancelled' is therefore a close approximation rather than exact.
ds_copy['Courier Status']=ds_copy['Courier Status'].fillna('Cancelled')

# Verify if nulls remain
print("Null values count in Courier status column",ds_copy['Courier Status'].isnull().sum())
The values of status column and Courier status column         Status Courier Status
0   Cancelled            NaN
3   Cancelled            NaN
23  Cancelled            NaN
29  Cancelled            NaN
83  Cancelled            NaN
No of rows has cancelled value in Status column 6861
No of rows has Null value in Courier status column 6872
Null values count in Courier status column 0
In [18]:
# currency still has 7795 nulls at this point (handled together with Amount)
ds_copy['currency'].isnull().sum()
Out[18]:
7795
In [19]:
# High-null columns: promotion-ids, fulfilled-by, Unnamed: 22.
# 'Unnamed: 22' is an artifact column with no information, so drop it.
ds_copy = ds_copy.drop(columns='Unnamed: 22')
ds_copy.columns.tolist()
Out[19]:
['index',
 'Order ID',
 'Date',
 'Status',
 'Fulfilment',
 'Sales Channel ',
 'ship-service-level',
 'Style',
 'SKU',
 'Category',
 'Size',
 'ASIN',
 'Courier Status',
 'Qty',
 'currency',
 'Amount',
 'ship-city',
 'ship-state',
 'ship-postal-code',
 'ship-country',
 'promotion-ids',
 'B2B',
 'fulfilled-by']
In [20]:
# Preview the modes of the two remaining high-null categorical columns
print(ds_copy['promotion-ids'].mode()[0])
print(ds_copy['fulfilled-by'].mode()[0])
IN Core Free Shipping 2015/04/08 23-48-5-108
Easy Ship
In [21]:
# promotion-ids: impute missing values with the most frequent promo id (mode)
promo_mode = ds_copy['promotion-ids'].mode()[0]
print("mode value for promotion-ids column is :", promo_mode)

# Fill the missing promotion ids with the mode
ds_copy['promotion-ids'] = ds_copy['promotion-ids'].fillna(promo_mode)

# Confirm no nulls remain
ds_copy['promotion-ids'].isnull().sum()
mode value for promotion-ids column is : IN Core Free Shipping 2015/04/08 23-48-5-108
Out[21]:
0
In [22]:
# fulfilled-by: impute missing values with the most frequent value (mode).
# NOTE(review): ~70% of this column is null, so mode imputation dominates
# the distribution — treat downstream analysis of this feature with care.
fulfilled_mode = ds_copy['fulfilled-by'].mode()[0]
print("mode value for fulfilled-by column is :", fulfilled_mode)

# Fill the missing values with the mode
ds_copy['fulfilled-by'] = ds_copy['fulfilled-by'].fillna(fulfilled_mode)

# Confirm no nulls remain
ds_copy['fulfilled-by'].isnull().sum()
mode value for fulfilled-by column is : Easy Ship
Out[22]:
0
In [23]:
# Re-check nulls: only currency and Amount should remain
ds_copy.isnull().sum()
Out[23]:
index                    0
Order ID                 0
Date                     0
Status                   0
Fulfilment               0
Sales Channel            0
ship-service-level       0
Style                    0
SKU                      0
Category                 0
Size                     0
ASIN                     0
Courier Status           0
Qty                      0
currency              7795
Amount                7795
ship-city                0
ship-state               0
ship-postal-code         0
ship-country             0
promotion-ids            0
B2B                      0
fulfilled-by             0
dtype: int64
In [24]:
# Preview rows with missing Amount alongside Status and Qty
only_nulls = ds_copy[ds_copy['Amount'].isna()][['Status', 'Qty' , 'Amount']]
only_nulls.head(5)
Out[24]:
Status Qty Amount
8 Cancelled 0 NaN
29 Cancelled 0 NaN
65 Cancelled 0 NaN
84 Cancelled 0 NaN
95 Cancelled 0 NaN
In [25]:
# Filter rows where 'Amount' is NaN and select the relevant columns
only_nulls = ds_copy[ds_copy['Amount'].isna()][['Courier Status', 'Qty', 'Amount']]

# Display the first 5 rows of the filtered data
print("The values of Amt,Qty,courier Status column ", only_nulls.head(5))

# Null Amounts for cancelled shipments (7671 rows per the output below)
only_nulls_st = only_nulls[only_nulls['Courier Status'] == 'Cancelled'][['Courier Status', 'Qty', 'Amount']]
print("No of rows has cancelled value in courier status column", only_nulls_st.shape[0])

# Null Amounts for unshipped orders (the remaining 124 rows).
# Fixed: the previous labels wrongly said "Status" / "cancelled" here.
only_nulls_un = only_nulls[only_nulls['Courier Status'] == 'Unshipped'][['Courier Status', 'Qty', 'Amount']]
print("The values of Amt,Qty,courier Status column ", only_nulls_un.head(5))
print("No of rows has Unshipped value in courier status column", only_nulls_un.shape[0])

# Every null Amount belongs to a Cancelled or Unshipped order (7671 + 124
# = 7795, the full null count), i.e. no money changed hands — so imputing
# zero is a valid business interpretation rather than a statistical guess.
ds_copy['Amount'] = ds_copy['Amount'].fillna(0)

# Verify if nulls remain
print("Null values count in Amount column", ds_copy['Amount'].isnull().sum())
The values of Amt,Qty,courier Status column     Courier Status  Qty  Amount
8       Cancelled    0     NaN
29      Cancelled    0     NaN
65      Cancelled    0     NaN
84      Cancelled    0     NaN
95      Cancelled    0     NaN
No of rows has cancelled value in courier status column 7671
The values of Amt,Qty,Status column       Courier Status  Qty  Amount
937       Unshipped   15     NaN
1057      Unshipped    2     NaN
3631      Unshipped    9     NaN
3632      Unshipped    1     NaN
3634      Unshipped    1     NaN
No of rows has cancelled value in courier status column 124
Null values count in Amount column 0
In [26]:
# Cardinality check for candidate categorical features
for label, col in [
    ('Style', 'Style'),
    ('SKU', 'SKU'),
    ('ASIN', 'ASIN'),
    ('Courier Status', 'Courier Status'),
    ('B2B', 'B2B'),
    ('promotion-ids', 'promotion-ids'),
    ('size', 'Size'),
]:
    print(f"unique values of {label}:", ds_copy[col].unique().size)
unique values of Style: 1377
unique values of SKU: 7195
unique values of ASIN: 7190
unique values of Courier Status: 3
unique values of B2B: 2
unique values of promotion-ids: 5787
unique values of size: 11
In [27]:
# The 11 raw size labels that the next cell buckets into Small/Medium/Large
ds_copy['Size'].unique().tolist()
Out[27]:
['S', '3XL', 'XL', 'L', 'XXL', 'XS', '6XL', 'M', '4XL', '5XL', 'Free']
In [28]:
# Collapse the 11 raw size labels into three coarse buckets
bucket_members = {
    'Small': ['XS', 'S'],
    'Medium': ['M', 'L', 'XL'],
    'Large': ['XXL', '3XL', '4XL', '5XL', '6XL', 'Free'],
}
size_mapping = {size: bucket for bucket, sizes in bucket_members.items() for size in sizes}

# Apply the bucket mapping to the Size column
ds_copy['Size_Aggregated'] = ds_copy['Size'].map(size_mapping)

# Verify the transformation
print(ds_copy[['Size', 'Size_Aggregated']].head())
  Size Size_Aggregated
0    S           Small
1  3XL           Large
2   XL          Medium
3    L          Medium
4  3XL           Large
In [29]:
# Confirm only the three expected buckets exist
ds_copy['Size_Aggregated'].unique()
Out[29]:
array(['Small', 'Large', 'Medium'], dtype=object)
In [30]:
# 'Unnamed: 22' was already dropped in an earlier cell; the commented call
# below is redundant and kept only as a record of that decision.

# ds_copy.drop('Unnamed: 22', axis=1,inplace=True)
In [31]:
# Row counts per aggregated size bucket
ds_copy['Size_Aggregated'].value_counts()
Out[31]:
Size_Aggregated
Medium    65719
Large     35005
Small     28251
Name: count, dtype: int64
In [32]:
# NOTE: promotion-ids was already mode-imputed in an earlier cell, so this
# cell is an idempotent re-run kept as a verification step — fillna on a
# fully populated column leaves it unchanged. The local variable was also
# renamed: it previously reused `ful_mode` from the fulfilled-by cell,
# which was misleading.
promo_mode = ds_copy['promotion-ids'].mode()[0]
print("mode value for promotion-ids column is :", promo_mode)

# No-op fill (column already has no nulls)
ds_copy['promotion-ids'] = ds_copy['promotion-ids'].fillna(promo_mode)

# Verify if nulls remain
ds_copy['promotion-ids'].isnull().sum()
mode value for promotion-ids column is : IN Core Free Shipping 2015/04/08 23-48-5-108
Out[32]:
0
In [33]:
# Count distinct promotion-id combinations (5787 — too many to one-hot)
unique_promotion_ids = ds_copy['promotion-ids'].unique()
print(f"Number of unique promotion IDs: {len(unique_promotion_ids)}")
Number of unique promotion IDs: 5787
In [34]:
# Frequency-encode promotion-ids: each row gets the count of its promo combo
promo_counts = ds_copy['promotion-ids'].value_counts()
ds_copy['Promotion_IDs_Frequency'] = ds_copy['promotion-ids'].map(promo_counts)

# Check results
print(ds_copy[['promotion-ids', 'Promotion_IDs_Frequency']].head())
                                       promotion-ids  Promotion_IDs_Frequency
0       IN Core Free Shipping 2015/04/08 23-48-5-108                    95253
1  Amazon PLCC Free-Financing Universal Merchant ...                        1
2       IN Core Free Shipping 2015/04/08 23-48-5-108                    95253
3       IN Core Free Shipping 2015/04/08 23-48-5-108                    95253
4       IN Core Free Shipping 2015/04/08 23-48-5-108                    95253
In [35]:
# Distribution of the frequency-encoded promotion feature
sns.histplot(ds_copy['Promotion_IDs_Frequency'], bins=10, kde=True)
Out[35]:
<Axes: xlabel='Promotion_IDs_Frequency', ylabel='Count'>
In [36]:
# Combine the four shipping fields into a single '_'-joined location key
location_parts = ['ship-city', 'ship-state', 'ship-postal-code', 'ship-country']
ds_copy['Ship_Location'] = ds_copy[location_parts].astype(str).agg('_'.join, axis=1)

# Check results
print(ds_copy[['ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'Ship_Location']].head())
     ship-city   ship-state  ship-postal-code ship-country  \
0       MUMBAI  MAHARASHTRA            400081           IN   
1    BENGALURU    KARNATAKA            560085           IN   
2  NAVI MUMBAI  MAHARASHTRA            410210           IN   
3   PUDUCHERRY   PUDUCHERRY            605008           IN   
4      CHENNAI   TAMIL NADU            600073           IN   

                       Ship_Location  
0       MUMBAI_MAHARASHTRA_400081_IN  
1      BENGALURU_KARNATAKA_560085_IN  
2  NAVI MUMBAI_MAHARASHTRA_410210_IN  
3    PUDUCHERRY_PUDUCHERRY_605008_IN  
4       CHENNAI_TAMIL NADU_600073_IN  
In [37]:
# Frequency-encode Ship_Location: each row gets its location's row count
location_counts = ds_copy['Ship_Location'].value_counts()
ds_copy['Ship_Location_Frequency'] = ds_copy['Ship_Location'].map(location_counts)

# Check the result
print(ds_copy[['Ship_Location', 'Ship_Location_Frequency']].head())
                       Ship_Location  Ship_Location_Frequency
0       MUMBAI_MAHARASHTRA_400081_IN                      111
1      BENGALURU_KARNATAKA_560085_IN                      156
2  NAVI MUMBAI_MAHARASHTRA_410210_IN                      184
3    PUDUCHERRY_PUDUCHERRY_605008_IN                       42
4       CHENNAI_TAMIL NADU_600073_IN                      172
In [38]:
# Confirm the engineered columns were added
ds_copy.columns.tolist()
Out[38]:
['index',
 'Order ID',
 'Date',
 'Status',
 'Fulfilment',
 'Sales Channel ',
 'ship-service-level',
 'Style',
 'SKU',
 'Category',
 'Size',
 'ASIN',
 'Courier Status',
 'Qty',
 'currency',
 'Amount',
 'ship-city',
 'ship-state',
 'ship-postal-code',
 'ship-country',
 'promotion-ids',
 'B2B',
 'fulfilled-by',
 'Size_Aggregated',
 'Promotion_IDs_Frequency',
 'Ship_Location',
 'Ship_Location_Frequency']
In [39]:
 ds_copy['Ship_Location_Frequency'].head()
Out[39]:
0    111
1    156
2    184
3     42
4    172
Name: Ship_Location_Frequency, dtype: int64
In [40]:
# EDA part

# Distribution of order revenue (right-skewed, motivating the log transform)
sns.histplot(ds_copy['Amount'], kde=True, bins=30)
plt.title('Distribution of Amount (Prices)')
plt.show()
In [41]:
# log1p(Amount) compresses the right tail for a more symmetric distribution
ds_copy['Log_Amount'] = np.log1p(ds_copy['Amount'])
sns.histplot(ds_copy['Log_Amount'], kde=True, bins=30)
plt.title('Log-Transformed Prices')
plt.show()
In [42]:
print(ds_copy[ds_copy['Amount'] <= 1].shape[0])  # Count of values <= 1 (mostly zero-filled cancelled orders)
print(ds_copy.shape[0])

# NOTE: np.log1p computes log(1 + x), which is well-defined at x = 0,
# so — unlike a plain log — no zero-replacement is actually required
# before the transformation.
10138
128975
In [43]:
# log1p(x) = log(1 + x) is already defined at zero, so the epsilon offset
# ("+ 1e-6") previously added here was unnecessary and only introduced
# numerical noise; compute the transform directly.
ds_copy['Log_Amount_Adjusted'] = np.log1p(ds_copy['Amount'])
ds_copy['Log_Amount_Adjusted'].head()
Out[43]:
0    6.474847
1    6.008813
2    5.799093
3    6.625830
4    6.354370
Name: Log_Amount_Adjusted, dtype: float64
In [44]:
# Flag zero-revenue orders (cancelled/unshipped) vs paid orders
ds_copy['Amount_Category'] = ds_copy['Amount'].apply(lambda amt: 'Non-Zero' if amt else 'Zero')
ds_copy['Amount_Category'].value_counts()
Out[44]:
Amount_Category
Non-Zero    118837
Zero         10138
Name: count, dtype: int64
In [45]:
# Boxplot before winsorization — shows the outliers to be capped
sns.boxplot(x=ds_copy['Amount'])
plt.title('Boxplot of Amount')
plt.show()
In [46]:
# Winsorize 'Amount': cap values below the 5th percentile and above the
# 95th percentile at those bounds
lower_bound = ds_copy['Amount'].quantile(0.05)
upper_bound = ds_copy['Amount'].quantile(0.95)

ds_copy['Amount'] = ds_copy['Amount'].clip(lower=lower_bound, upper=upper_bound)
In [47]:
# Boxplot after winsorization — outliers now capped at the 5th/95th percentiles
sns.boxplot(x=ds_copy['Amount'])
plt.title('Boxplot of Amount')
plt.show()
In [48]:
# Analyze Amount_Category
# Proportion of zero vs non-zero revenue orders (normalize=True gives %)
ds_copy['Amount_Category'].value_counts(normalize=True) * 100
Out[48]:
Amount_Category
Non-Zero    92.139562
Zero         7.860438
Name: proportion, dtype: float64
In [49]:
# Split the data by revenue category to compare their numeric profiles
zero_category = ds_copy[ds_copy['Amount_Category'] == 'Zero']
non_zero_category = ds_copy[ds_copy['Amount_Category'] == 'Non-Zero']

# Summary statistics for paid orders (swap the comment below to inspect
# the zero-revenue subset instead)
#zero_category.describe()
non_zero_category.describe()
Out[49]:
index Qty Amount ship-postal-code Promotion_IDs_Frequency Ship_Location_Frequency Log_Amount Log_Amount_Adjusted
count 118837.000000 118837.000000 118837.000000 118837.000000 118837.000000 118837.000000 118837.000000 118837.000000
mean 64452.670229 0.960467 652.090141 464133.377803 68867.052745 81.768776 6.419550 6.419550
std 37211.968633 0.216387 243.646091 191142.408316 42609.965810 123.117205 0.388115 0.388115
min 0.000000 0.000000 199.000000 110001.000000 1.000000 1.000000 5.298317 5.298317
25% 32260.000000 1.000000 458.000000 382449.000000 146.000000 8.000000 6.129050 6.129050
50% 64410.000000 1.000000 622.000000 500034.000000 95253.000000 34.000000 6.434547 6.434547
75% 96632.000000 1.000000 788.000000 600023.000000 95253.000000 98.000000 6.670766 6.670766
max 128974.000000 8.000000 1166.000000 989898.000000 95253.000000 821.000000 8.627840 8.627840
In [50]:
# Full view of the location-frequency feature (head/tail render)
ds_copy['Ship_Location_Frequency']
Out[50]:
0         111
1         156
2         184
3          42
4         172
         ... 
128970    111
128971    148
128972    393
128973      1
128974      1
Name: Ship_Location_Frequency, Length: 128975, dtype: int64
In [51]:
# Date-based features

# The sample dates look like "04-30-22" (MM-DD-YY); parse with an explicit
# format so pandas cannot silently infer day-first ordering.
# NOTE(review): assumes every row shares this format — confirm on full data.
ds_copy['Date'] = pd.to_datetime(ds_copy['Date'], format='%m-%d-%y')

# Extract date-based features
ds_copy['Year'] = ds_copy['Date'].dt.year
ds_copy['Month'] = ds_copy['Date'].dt.month
ds_copy['Day'] = ds_copy['Date'].dt.day
ds_copy['Day_of_Week'] = ds_copy['Date'].dt.dayofweek
# Saturday=5, Sunday=6 under pandas dayofweek; vectorized instead of apply
ds_copy['Is_Weekend'] = (ds_copy['Day_of_Week'] >= 5).astype(int)

print(ds_copy[['Year','Month','Day','Day_of_Week','Is_Weekend']].head())
   Year  Month  Day  Day_of_Week  Is_Weekend
0  2022      4   30            5           1
1  2022      4   30            5           1
2  2022      4   30            5           1
3  2022      4   30            5           1
4  2022      4   30            5           1
In [52]:
# Count promotions per order: the ids are comma-separated in one string
def count_promotions(promo_str):
    """Return the number of comma-separated promotion ids (0 for an empty string)."""
    if promo_str == '':
        return 0
    return len(promo_str.split(','))

ds_copy['num_promotions'] = ds_copy['promotion-ids'].apply(count_promotions)

# Display the result
print(ds_copy[['Order ID', 'promotion-ids', 'num_promotions']].head())
              Order ID                                      promotion-ids  \
0  405-8078784-5731545       IN Core Free Shipping 2015/04/08 23-48-5-108   
1  171-9198151-1101146  Amazon PLCC Free-Financing Universal Merchant ...   
2  404-0687676-7273146       IN Core Free Shipping 2015/04/08 23-48-5-108   
3  403-9615377-8133951       IN Core Free Shipping 2015/04/08 23-48-5-108   
4  407-1069790-7240320       IN Core Free Shipping 2015/04/08 23-48-5-108   

   num_promotions  
0               1  
1              25  
2               1  
3               1  
4               1  
In [53]:
# Binary flag for orders carrying more than three promotions
ds_copy['High_Promotion'] = (ds_copy['num_promotions'] > 3).astype(int)
ds_copy['High_Promotion'].head()
Out[53]:
0    0
1    1
2    0
3    0
4    0
Name: High_Promotion, dtype: int64
In [54]:
# Group by 'Category' and calculate the total number of promotions for each category
category_promotions = ds_copy.groupby('Category')['num_promotions'].sum()

print(category_promotions)
Category
Blouse             4492
Bottom             3529
Dupatta               3
Ethnic Dress       6163
Saree               677
Set              289997
Top               54070
Western Dress    133829
kurta            273435
Name: num_promotions, dtype: int64
In [55]:
# Group by 'ship-country' and calculate the total number of promotions for each country
country_promotions = ds_copy.groupby('ship-country')['num_promotions'].sum()

# Display the result
print(country_promotions)
ship-country
IN    766195
Name: num_promotions, dtype: int64
In [56]:
# Remove zero-quantity rows (no units sold) before per-unit calculations
ds_copy = ds_copy.query("Qty != 0")
In [57]:
# Per-unit price (safe now that Qty == 0 rows were removed above)
ds_copy['Price_Per_Unit'] = ds_copy['Amount'] / ds_copy['Qty']
ds_copy['Price_Per_Unit'].head()
Out[57]:
1    406.0
2    329.0
4    574.0
5    824.0
6    653.0
Name: Price_Per_Unit, dtype: float64
In [58]:
# Plot the distribution of the 'Price_Per_Unit'
sns.histplot(ds_copy['Price_Per_Unit'], kde=True)
plt.title('Distribution of Price Per Unit')
plt.show()
In [59]:
# Check correlation between Quantity and Price per Unit
correlation = ds_copy[['Qty', 'Price_Per_Unit']].corr()
print(correlation)


# Interpretation: -0.036668 is a very weak negative correlation between
# Qty and Price_Per_Unit — any tendency for per-unit price to fall as
# quantity rises is too small to rely on, so quantity effectively has no
# meaningful impact on the per-unit price in this dataset.
                     Qty  Price_Per_Unit
Qty             1.000000       -0.036668
Price_Per_Unit -0.036668        1.000000
In [60]:
# Correlation between Quantity and total order Amount
correlation = ds_copy[['Qty', 'Amount']].corr()
print(correlation)
# Interpretation: 0.051943 is a very weak positive correlation between
# Qty and Amount — Amount rises only slightly with quantity, too weakly
# to support predictions or strong insights.
             Qty    Amount
Qty     1.000000  0.051943
Amount  0.051943  1.000000
In [61]:
# Row count after removing zero-quantity orders
ds_copy.shape[0]
Out[61]:
116168
In [62]:
# Visualizations for Relationships
# Amount vs Promotion_IDs_Frequency
# FIX: plotly was previously imported only in a later cell, so this cell
# raised NameError under Restart-&-Run-All; import it before first use.
import plotly.express as px

fig1 = px.scatter(ds_copy, x="Promotion_IDs_Frequency", y="Amount", title="Amount vs. Promotion IDs Frequency")
fig1.show()
In [63]:
# Current column inventory after feature engineering
ds_copy.columns
Out[63]:
Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ',
       'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN',
       'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city',
       'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids',
       'B2B', 'fulfilled-by', 'Size_Aggregated', 'Promotion_IDs_Frequency',
       'Ship_Location', 'Ship_Location_Frequency', 'Log_Amount',
       'Log_Amount_Adjusted', 'Amount_Category', 'Year', 'Month', 'Day',
       'Day_of_Week', 'Is_Weekend', 'num_promotions', 'High_Promotion',
       'Price_Per_Unit'],
      dtype='object')
In [64]:
# Visualization 2: Amount vs. Promotion_IDs_Frequency
# NOTE(review): Promotion_IDs_Frequency is a high-cardinality numeric
# feature, so a categorical boxplot draws one box per distinct value —
# consider binning it first; confirm this renders as intended.
plt.figure(figsize=(8, 6))
sns.boxplot(data=ds_copy, x='Promotion_IDs_Frequency', y='Amount')
plt.title('Amount vs. Promotion IDs Frequency')
plt.show()
In [65]:
# NOTE: this import should live in the top imports cell — an earlier cell
# already uses `px` and fails on a fresh kernel run without it.
import plotly.express as px

fig1 = px.box(ds_copy, x="Promotion_IDs_Frequency", y="Amount", title="Amount vs. Promotion IDs Frequency")
fig1.show()
In [66]:
# Distribution of the location-frequency feature (how many rows share each count)
ds_copy['Ship_Location_Frequency'].value_counts()
Out[66]:
Ship_Location_Frequency
1      7582
2      5554
3      3953
4      3095
5      2764
       ... 
116     107
120     107
107     105
114     101
102      91
Name: count, Length: 210, dtype: int64
In [67]:
# Correlation heat-map over the numeric feature space
numeric_features = ds_copy.select_dtypes(include=['number'])
corr_matrix = numeric_features.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f")
plt.title("Correlation Matrix for Numerical Columns")
plt.show()
In [68]:
# Normalization for free-text state names: lowercase, trim stray
# whitespace, and fold known alternate spellings/abbreviations onto
# canonical names.

def normalize_state(state):
    """Return a canonical lowercase state name for a raw 'ship-state' value.

    Known variants (e.g. 'Pondicherry', 'Orissa', 'RJ') map to their
    canonical names; unknown states pass through stripped/lowercased.
    """
    state = state.strip().lower()  # raw data mixes case; strip guards against padding
    # Only genuine corrections are listed — identity entries were redundant,
    # since dict.get falls back to the (already normalized) input.
    state_corrections = {
        'new delhi': 'delhi', 'nd': 'delhi',
        'pondicherry': 'puducherry',
        'rajshthan': 'rajasthan', 'rj': 'rajasthan',
        'orissa': 'odisha', 'pb': 'punjab',
    }
    return state_corrections.get(state, state)

# Normalize the 'ship-state' column in place (column has no nulls at this point)
ds_copy['ship-state'] = ds_copy['ship-state'].apply(normalize_state)

# Region lookup table (state names in lowercase, matching normalization above)
region_mapping = {
    'south': ['tamil nadu', 'kerala', 'karnataka', 'andhra pradesh'],
    'north': ['delhi', 'haryana', 'punjab', 'uttar pradesh'],
    'east': ['west bengal', 'odisha', 'bihar', 'assam', 'jharkhand', 'meghalaya', 'tripura', 'mizoram', 'nagaland'],
    'west': ['maharashtra', 'gujarat', 'rajasthan', 'goa'],
    'central': ['madhya pradesh', 'chhattisgarh'],
    'other': ['chandigarh', 'lakshadweep', 'andaman & nicobar', 'jammu & kashmir', 'sikkim', 'ladakh']
}

def map_region(state):
    """Return the region bucket for a normalized state name ('other' when unmapped)."""
    state_to_region = {s: region for region, members in region_mapping.items() for s in members}
    return state_to_region.get(state, 'other')

# Tag every row with its region bucket
ds_copy['Region'] = ds_copy['ship-state'].apply(map_region)

# Count rows per region
region_counts = ds_copy.groupby('Region').size()

# Output the grouped data
print(region_counts)
Region
central     3078
east       12389
north      21621
other      14432
south      36777
west       27871
dtype: int64
In [69]:
# Final column inventory including the Region feature
ds_copy.columns.tolist()
Out[69]:
['index',
 'Order ID',
 'Date',
 'Status',
 'Fulfilment',
 'Sales Channel ',
 'ship-service-level',
 'Style',
 'SKU',
 'Category',
 'Size',
 'ASIN',
 'Courier Status',
 'Qty',
 'currency',
 'Amount',
 'ship-city',
 'ship-state',
 'ship-postal-code',
 'ship-country',
 'promotion-ids',
 'B2B',
 'fulfilled-by',
 'Size_Aggregated',
 'Promotion_IDs_Frequency',
 'Ship_Location',
 'Ship_Location_Frequency',
 'Log_Amount',
 'Log_Amount_Adjusted',
 'Amount_Category',
 'Year',
 'Month',
 'Day',
 'Day_of_Week',
 'Is_Weekend',
 'num_promotions',
 'High_Promotion',
 'Price_Per_Unit',
 'Region']
In [595]:
# NOTE(review): execution counts In[595]/In[597] are out of order with the
# surrounding cells — re-run the notebook top-to-bottom to rule out hidden
# kernel state before trusting these outputs.
ds_copy.Region
Out[595]:
1           south
2            west
4           south
5           north
6           other
           ...   
128970      other
128971      north
128972      other
128973       west
128974    central
Name: Region, Length: 116168, dtype: object
In [597]:
# Tableau visualization export

# Only the columns the dashboards need are written out
tableau_columns = [
    'Date',                # time-based analysis
    'Category',            # product segmentation
    'Amount',              # revenue
    'Price_Per_Unit',      # price point
    'ship-service-level',  # shipping service level
    'Status',              # order status (fixed mislabel: not courier status)
    'Fulfilment',          # fulfilment method performance
    'Qty',                 # quantity sold (demand)
    'promotion-ids',       # promotional analysis
    'num_promotions',      # number of promotions
    'Region',              # geographic insights
    'ship-country',
    'Day_of_Week',         # seasonal trends
    'Is_Weekend',          # weekend impact
    'High_Promotion',      # promotion flags
    'Year',                # year for trend analysis
    'Month',               # month for trend analysis
]

# Select then copy, so the export is isolated from later edits to ds_copy
filtered_df = ds_copy[tableau_columns].copy()

# Save the filtered dataset for Tableau
filtered_file_path = 'Dataset_for_tableau_main_proj.csv'
filtered_df.to_csv(filtered_file_path, index=False)

print(f"Filtered dataset saved to {filtered_file_path}")
Filtered dataset saved to Dataset_for_tableau_main_proj.csv
In [70]:
# Final sanity check on the column set
ds_copy.columns
Out[70]:
Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ',
       'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN',
       'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city',
       'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids',
       'B2B', 'fulfilled-by', 'Size_Aggregated', 'Promotion_IDs_Frequency',
       'Ship_Location', 'Ship_Location_Frequency', 'Log_Amount',
       'Log_Amount_Adjusted', 'Amount_Category', 'Year', 'Month', 'Day',
       'Day_of_Week', 'Is_Weekend', 'num_promotions', 'High_Promotion',
       'Price_Per_Unit', 'Region'],
      dtype='object')
In [ ]: